The Atari Compendium

home *** CD-ROM | disk | FTP | other *** search

/ The Atari Compendium / The Atari Compendium (Toad Computers) (1994).iso / files / compress / arc.zoo / arclzw.c < prev next >

Wrap

C/C++ Source or Header | 1989-01-29 | 22KB | 805 lines

/* * $Header: arclzw.c,v 1.6 88/07/31 18:49:49 hyc Exp $ */ /* * ARC - Archive utility - ARCLZW * * Version 2.03, created on 10/24/86 at 11:46:22 * * (C) COPYRIGHT 1985,86 by System Enhancement Associates; ALL RIGHTS RESERVED * * By: Thom Henderson * * Description: This file contains the routines used to implement Lempel-Zev * data compression, which calls for building a coding table on the fly. * This form of compression is especially good for encoding files which * contain repeated strings, and can often give dramatic improvements over * traditional Huffman SQueezing. * * Language: Computer Innovations Optimizing C86 * * Programming notes: In this section I am drawing heavily on the COMPRESS * program from UNIX. The basic method is taken from "A Technique for High * Performance Data Compression", Terry A. Welch, IEEE Computer Vol 17, No 6 * (June 1984), pp 8-19. Also see "Knuth's Fundamental Algorithms", Donald * Knuth, Vol 3, Section 6.4. * * As best as I can tell, this method works by tracing down a hash table of code * strings where each entry has the property: * * if <string> <char> is in the table then <string> is in the table. */ #include <stdio.h> #include "arc.h" void putc_pak(), abort(), putc_ncr(); int getc_unp(); #if MSDOS char *setmem(); #else #ifndef __STDC__ char *memset(); #endif #endif static void putcode(); /* definitions for older style crunching */ #define FALSE 0 #define TRUE !FALSE #define TABSIZE 4096 #define NO_PRED 0xFFFF #define EMPTY 0xFFFF #define NOT_FND 0xFFFF static unsigned short inbuf; /* partial input code storage */ static int sp; /* current stack pointer */ struct entry { /* string table entry format */ char used; /* true when this entry is in use */ unsigned char follower; /* char following string */ unsigned short next; /* ptr to next in collision list */ unsigned short predecessor; /* code for preceeding string */ }; /* string_tab[TABSIZE]; the code string table */ /* definitions for the new dynamic Lempel-Zev crunching */ #define BITS 12 /* maximum bits per code */ #define HSIZE 5003 /* 80% occupancy */ #define INIT_BITS 9 /* initial number of bits/code */ static int n_bits; /* number of bits/code */ static int maxcode; /* maximum code, given n_bits */ #define MAXCODE(n) ((1<<(n)) - 1) /* maximum code calculation */ static int maxcodemax = 1 << BITS; /* largest possible code (+1) */ static char buf[BITS]; /* input/output buffer */ static unsigned char lmask[9] = /* left side masks */ { 0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00 }; static unsigned char rmask[9] = /* right side masks */ { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff }; static int offset; /* byte offset for code output */ static long in_count; /* length of input */ static long bytes_out; /* length of compressed output */ static long bytes_ref; /* output quality reference */ static long bytes_last; /* output size at last checkpoint */ static unsigned short ent; /* * To save much memory (which we badly need at this point), we overlay the * table used by the previous version of Lempel-Zev with those used by the * new version. Since no two of these routines will be used together, we can * safely do this. */ extern long htab[HSIZE]; /* hash code table (crunch) */ extern unsigned short codetab[HSIZE]; /* string code table (crunch) */ static struct entry *string_tab=(struct entry *)htab; /* old crunch string table */ static unsigned short *prefix=codetab; /* prefix code table (uncrunch) */ static unsigned char *suffix=(unsigned char *)htab; /* suffix table (uncrunch) */ static int free_ent; /* first unused entry */ static int firstcmp; /* true at start of compression */ extern unsigned char stack[HSIZE]; /* local push/pop stack */ /* * block compression parameters -- after all codes are used up, and * compression rate changes, start over. */ static int clear_flg; #define CHECK_GAP 2048 /* ratio check interval */ static long checkpoint; void upd_tab(); /* * the next two codes should not be changed lightly, as they must not lie * within the contiguous general code space. */ #define FIRST 257 /* first free entry */ #define CLEAR 256 /* table clear output code */ /* * The cl_block() routine is called at each checkpoint to determine if * compression would likely improve by resetting the code table. The method * chosen to determine this is based on empirical observation that, in * general, every 2k of input data should compress at least as well as the * first 2k of input. */ static void cl_block(t) /* table clear for block compress */ FILE *t; /* our output file */ { checkpoint = in_count + CHECK_GAP; if (bytes_ref) { if (bytes_out - bytes_last > bytes_ref) { setmem(htab, HSIZE * sizeof(long), 0xff); free_ent = FIRST; clear_flg = 1; putcode(CLEAR, t); bytes_ref = 0; } } else bytes_ref = bytes_out - bytes_last; bytes_last = bytes_out; /* remember where we were */ } /***************************************************************** * * Output a given code. * Inputs: * code: A n_bits-bit integer. If == -1, then EOF. This assumes * that n_bits =< (LONG)wordsize - 1. * Outputs: * Outputs code to the file. * Assumptions: * Chars are 8 bits long. * Algorithm: * Maintain a BITS character long buffer (so that 8 codes will * fit in it exactly). When the buffer fills up empty it and start over. */ static void putcode(code, t) /* output a code */ int code; /* code to output */ FILE *t; /* where to put it */ { int r_off = offset; /* right offset */ int bits = n_bits; /* bits to go */ char *bp = buf; /* buffer pointer */ int n; /* index */ register int ztmp; if (code >= 0) { /* if a real code *//* Get to the first byte. */ bp += (r_off >> 3); r_off &= 7; /* * Since code is always >= 8 bits, only need to mask the * first hunk on the left. */ ztmp = (code << r_off) & lmask[r_off]; *bp = (*bp & rmask[r_off]) | ztmp; bp++; bits -= (8 - r_off); code >>= (8 - r_off); /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */ if (bits >= 8) { *bp++ = code; code >>= 8; bits -= 8; } /* Last bits. */ if (bits) *bp = code; offset += n_bits; if (offset == (n_bits << 3)) { bp = buf; bits = n_bits; bytes_out += bits; do putc_pak(*bp++, t); while (--bits); offset = 0; } /* * If the next entry is going to be too big for the code * size, then increase it, if possible. */ if (free_ent > maxcode || clear_flg > 0) { /* * Write the whole buffer, because the input side * won't discover the size increase until after * it has read it. */ if (offset > 0) { bp = buf; /* reset pointer for writing */ bytes_out += n = n_bits; while (n--) putc_pak(*bp++, t); } offset = 0; if (clear_flg) { /* reset if clearing */ maxcode = MAXCODE(n_bits = INIT_BITS); clear_flg = 0; } else {/* else use more bits */ n_bits++; if (n_bits == BITS) maxcode = maxcodemax; else maxcode = MAXCODE(n_bits); } } } else { /* dump the buffer on EOF */ bytes_out += n = (offset + 7) / 8; if (offset > 0) while (n--) putc_pak(*bp++, t); offset = 0; } } /***************************************************************** * * Read one code from the standard input. If EOF, return -1. * Inputs: * cmpin * Outputs: * code or -1 is returned. */ static int getcode(f) /* get a code */ FILE *f; /* file to get from */ { int code; static int offset = 0, size = 0; int r_off, bits; unsigned char *bp = (unsigned char *) buf; if (clear_flg > 0 || offset >= size || free_ent > maxcode) { /* * If the next entry will be too big for the current code * size, then we must increase the size. This implies * reading a new buffer f